Importing the libraries

In [ ]:
# NumPy -> To perform the Mathematical operations
import numpy as np

# Pandas -> Data Manipulation tool
import pandas as pd

# Matplotlib -> Data Visualisation tool
import matplotlib.pyplot as plt

# Seaborn -> Data Visualisation tool
import seaborn as sns

# SQLite -> Server-less Database
import sqlite3

Hints & Tips

  1. Establishing the connection to the database
In [ ]:
db = '/content/drive/MyDrive/Datasets/movies.sqlite'
conn = sqlite3.connect(db)
cur = conn.cursor()
  1. Get all the data about movies
In [ ]:
# Fetching the data of the Movies table from the Database
cur.execute("SELECT * FROM movies")
# Creating the cursor object
movies = cur.fetchall()
In [ ]:
# Creating a DataFrame
movies = pd.DataFrame(movies,columns=['id', 'original_title', 'budget', 'popularity',
                                  'release_date', 'revenue', 'title', 'vote_average',
                                  'vote_count', 'overview', 'tagline','uid',
                                  'director_id'] )
In [ ]:
# Displaying the DataFrame
movies
Out[ ]:
id Original_title Budget Popularity Release_date Revenue Title Avg_ratings Rating_count Overview Tagline uid director_id
0 43597 Avatar 237000000 150 2009-12-10 2787965087 Avatar 7.2 11800 In the 22nd century, a paraplegic Marine is di... Enter the World of Pandora. NaN 4762
1 43598 Pirates of the Caribbean: At World's End 300000000 139 2007-05-19 961000000 Pirates of the Caribbean: At World's End 6.9 4500 Captain Barbossa, long believed to be dead, ha... At the end of the world, the adventure begins. NaN 4763
2 43599 Spectre 245000000 107 2015-10-26 880674609 Spectre 6.3 4466 A cryptic message from Bond’s past sends him o... A Plan No One Escapes NaN 4764
3 43600 The Dark Knight Rises 250000000 112 2012-07-16 1084939099 The Dark Knight Rises 7.6 9106 Following the death of District Attorney Harve... The Legend Ends NaN 4765
4 43601 John Carter 260000000 43 2012-03-07 284139100 John Carter 6.1 2124 John Carter is a war-weary, former military ca... Lost in our world, found in another. NaN 4766
... ... ... ... ... ... ... ... ... ... ... ... ... ...
4768 48395 El Mariachi 220000 14 1992-09-04 2040920 El Mariachi 6.6 238 El Mariachi just wants to play his guitar and ... He didn't come looking for trouble, but troubl... NaN 5097
4769 48396 Newlyweds 9000 0 2011-12-26 0 Newlyweds 5.9 5 A newlywed couple's honeymoon is upended by th... A newlywed couple's honeymoon is upended by th... NaN 6485
4770 48397 Signed, Sealed, Delivered 0 1 2013-10-13 0 Signed, Sealed, Delivered 7.0 6 "Signed, Sealed, Delivered" introduces a dedic... None NaN 7108
4771 48398 Shanghai Calling 0 0 2012-05-03 0 Shanghai Calling 5.7 7 When ambitious New York attorney Sam is sent t... A New Yorker in Shanghai NaN 7109
4772 48399 My Date with Drew 0 1 2005-08-05 0 My Date with Drew 6.3 16 Ever since the second grade when he first saw ... None NaN 7110

4773 rows × 13 columns

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4773 entries, 0 to 4772
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              4773 non-null   int64  
 1   Original_title  4773 non-null   object 
 2   Budget          4773 non-null   int64  
 3   Popularity      4773 non-null   int64  
 4   Release_date    4773 non-null   object 
 5   Revenue         4773 non-null   int64  
 6   Title           4773 non-null   object 
 7   Avg_ratings     4773 non-null   float64
 8   Rating_count    4773 non-null   int64  
 9   Overview        4770 non-null   object 
 10  Tagline         3951 non-null   object 
 11  UID             4773 non-null   int64  
 12  director_id     4773 non-null   int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 484.9+ KB
  1. Get all the data about directors
In [ ]:
# Fetching the data of the Director table from the Database
cur.execute("SELECT * FROM directors")
# Creating the cursor object
directors = cur.fetchall()
In [ ]:
directors
In [ ]:
directors = pd.DataFrame(directors,columns=['name', 'id', 'gender', 'uid',
                                  'department'] )
In [ ]:
directors
Out[ ]:
name id gender uid department
0 James Cameron 4762 2 2710 Directing
1 Gore Verbinski 4763 2 1704 Directing
2 Sam Mendes 4764 2 39 Directing
3 Christopher Nolan 4765 2 525 Directing
4 Andrew Stanton 4766 2 7 Directing
... ... ... ... ... ...
2344 Shane Carruth 7106 2 76624 Directing
2345 Neill Dela Llana 7107 0 1174437 Directing
2346 Scott Smith 7108 0 1219158 Directing
2347 Daniel Hsia 7109 2 208138 Directing
2348 Brian Herzlinger 7110 2 85563 Directing

2349 rows × 5 columns

In [ ]:
directors.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2349 entries, 0 to 2348
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        2349 non-null   object
 1   id          2349 non-null   int64 
 2   gender      2349 non-null   int64 
 3   uid         2349 non-null   int64 
 4   department  2349 non-null   object
dtypes: int64(3), object(2)
memory usage: 91.9+ KB
  1. Check how many movies are present in the IMDB table
In [ ]:
cur.execute('SELECT COUNT(Title) FROM movies')
count = cur.fetchall()
print(f"The number of movies present in the IMDB databse is {count[0]}")
The number of movies present in the IMDB databse is (4773,)
  1. Find these 3 directors: James Cameron, Luc Besson, John Woo
In [ ]:
cur.execute("SELECT * FROM directors WHERE name IN ('James Cameron','Luc Besson', 'John Woo')")
cur.execute("SELECT * FROM directors WHERE name =='James Cameron' or name =='Luc Besson' or name == 'John Woo'")
three_directors = cur.fetchall()
print(f"Theses 3 Directors data are: {three_directors}")
Theses 3 Directors data are: [('James Cameron', 4762, 2, 2710, 'Directing'), ('John Woo', 4893, 2, 11401, 'Directing'), ('Luc Besson', 4949, 2, 59, 'Directing')]
  1. Find all the directors with name starting with 'Steven'
In [ ]:
cur.execute('SELECT * FROM directors WHERE name LIKE "Steven%"')
name_like = cur.fetchall()
print(f"The directors whose names are starting with the word 'Steven' are: {name_like}")
The directors whose names are starting with the word 'Steven' are: [('Steven Spielberg', 4799, 2, 488, 'Directing'), ('Steven Soderbergh', 4909, 2, 1884, 'Directing'), ('Steven Brill', 5013, 2, 32593, 'Directing'), ('Steven Zaillian', 5117, 2, 2260, 'Directing'), ('Steven Quale', 5216, 2, 93214, 'Directing'), ('Steven Seagal', 5221, 2, 23880, 'Directing'), ('Steven E. de Souza', 5390, 2, 1726, 'Directing'), ('Steven Shainberg', 5803, 2, 67795, 'Directing'), ('Steven R. Monroe', 6713, 2, 88039, 'Directing')]
  1. Count the Female directors
In [ ]:
cur.execute("SELECT COUNT(*) FROM directors WHERE gender == '1'")
females = cur.fetchall()
print(f"The number of female directors is {females[0][0]}")
The number of female directors is 150
  1. Find the name of the 10th first women directors
In [ ]:
cur.execute('SELECT name FROM directors WHERE gender==1')
tenth = cur.fetchall()
print(f"The tenth first women is {tenth[9][0]}")
The tenth first women is Angelina Jolie
  1. What are the 3 most popular movies?
In [ ]:
cur.execute('SELECT title FROM movies ORDER BY popularity DESC LIMIT 3 ')
most_popular = cur.fetchall()
print(f"The 3 mostpopular movies are: {most_popular[0][0]}, {most_popular[1][0]} and {most_popular[2][0]}")
The 3 mostpopular movies are: Minions, Interstellar and Deadpool
  1. What are the 3 most bankable movies?
In [ ]:
cur.execute('SELECT title FROM movies ORDER BY budget DESC LIMIT 3')
most_bankable = cur.fetchall()
most_bankable
print(f"The three most bankable movies are {most_bankable[0][0]}, {most_bankable[1][0]} and {most_bankable[2][0]}")
The three most bankable movies are Pirates of the Caribbean: On Stranger Tides, Pirates of the Caribbean: At World's End and Avengers: Age of Ultron
  1. What is the most awarded average rated movie since the Jan 1st, 2000?
In [ ]:
cur.execute("SELECT Original_title FROM movies WHERE Release_date	 > '2000-01-01' ORDER BY vote_average DESC LIMIT 1")
most_awarded_avg = cur.fetchall()
print(f"The most awarded average rated movie is {most_awarded_avg[0][0]}")
The most awarded average rated movie is Sardaarji
  1. Which movie(s) were directed by Brenda Chapman?
In [ ]:
cur.execute("SELECT original_title FROM movies JOIN directors ON directors.id = movies.director_id WHERE directors.name = 'Brenda Chapman'")
directed_by = cur.fetchall()
print(f"The movie(s) directed by Brenda Chapman is {directed_by[0][0]}")
The movie(s) directed by Brenda Chapman is Brave
  1. Name the director who has made the most movies?
In [ ]:
cur.execute("SELECT name FROM directors JOIN movies ON directors.id = movies.director_id GROUP BY director_id ORDER BY COUNT(name) DESC limit 1")
director_movie = cur.fetchall()
print(f"The director who made the most movies is {director_movie[0][0]}")
The director who made the most movies is Steven Spielberg
  1. Name of the director who is most bankable
In [ ]:
cur.execute("SELECT name FROM directors JOIN movies ON directors.id = movies.director_id GROUP BY director_id ORDER BY SUM(budget) DESC limit 1")
most_bankable = cur.fetchall()
print(f'The most bankable director is {most_bankable[0][0]}')
The most bankable director is Steven Spielberg

Budget Analysis

  1. Tell the Top 10 highest budget making movie
In [ ]:
movies.columns
Out[ ]:
Index(['id', 'original_title', 'budget', 'popularity', 'release_date',
       'revenue', 'title', 'vote_average', 'vote_count', 'overview', 'tagline',
       'uid', 'director_id'],
      dtype='object')
In [ ]:
cur.execute('Select * FROM movies ORDER BY budget DESC LIMIT 10')
top_10 = cur.fetchall()
most_popular = pd.DataFrame(top_10, columns=['id', 'original_title', 'budget', 'popularity', 'release_date',
       'revenue', 'title', 'vote_average', 'vote_count', 'overview', 'tagline',
       'uid', 'director_id'])
most_popular
Out[ ]:
id original_title budget popularity release_date revenue title vote_average vote_count overview tagline uid director_id
0 43614 Pirates of the Caribbean: On Stranger Tides 380000000 135 2011-05-14 1045713802 Pirates of the Caribbean: On Stranger Tides 6.4 4948 Captain Jack Sparrow crosses paths with a woma... Live Forever Or Die Trying. 1865 4775
1 43598 Pirates of the Caribbean: At World's End 300000000 139 2007-05-19 961000000 Pirates of the Caribbean: At World's End 6.9 4500 Captain Barbossa, long believed to be dead, ha... At the end of the world, the adventure begins. 285 4763
2 43604 Avengers: Age of Ultron 280000000 134 2015-04-22 1405403694 Avengers: Age of Ultron 7.3 6767 When Tony Stark tries to jumpstart a dormant p... A New Age Has Come. 99861 4769
3 43607 Superman Returns 270000000 57 2006-06-28 391081192 Superman Returns 5.4 1400 Superman returns to discover his 5-year absenc... None 1452 4772
4 43601 John Carter 260000000 43 2012-03-07 284139100 John Carter 6.1 2124 John Carter is a war-weary, former military ca... Lost in our world, found in another. 49529 4766
5 43603 Tangled 260000000 48 2010-11-24 591794936 Tangled 7.4 3330 When the kingdom's most wanted-and most charmi... They're taking adventure to new lengths. 38757 4768
6 43602 Spider-Man 3 258000000 115 2007-05-01 890871626 Spider-Man 3 5.9 3576 The seemingly invincible Spider-Man goes up ag... The battle within. 559 4767
7 43610 The Lone Ranger 255000000 49 2013-07-03 89289910 The Lone Ranger 5.9 2311 The Texas Rangers chase down a gang of outlaws... Never Take Off the Mask 57201 4763
8 43600 The Dark Knight Rises 250000000 112 2012-07-16 1084939099 The Dark Knight Rises 7.6 9106 Following the death of District Attorney Harve... The Legend Ends 49026 4765
9 43605 Harry Potter and the Half-Blood Prince 250000000 98 2009-07-07 933959197 Harry Potter and the Half-Blood Prince 7.4 5293 As Harry begins his sixth year at Hogwarts, he... Dark Secrets Revealed 767 4770

Revenue Analysis

  1. Find Top 10 Revenue making movies
In [ ]:
cur.execute("SELECT * FROM movies ORDER BY revenue DESC LIMIT 10")
top10_movies = cur.fetchall()
most_revenue = pd.DataFrame(top10_movies,  columns= ['id','original_title','budget','popularity','release_date',
                                    'revenue', 'title','vote_average','vote_count','overview',
                                    'tagline','uid','director_id'])
most_revenue
Out[ ]:
id original_title budget popularity release_date revenue title vote_average vote_count overview tagline uid director_id
0 43597 Avatar 237000000 150 2009-12-10 2787965087 Avatar 7.2 11800 In the 22nd century, a paraplegic Marine is di... Enter the World of Pandora. 19995 4762
1 43622 Titanic 200000000 100 1997-11-18 1845034188 Titanic 7.5 7562 84 years later, a 101-year-old woman named Ros... Nothing on Earth could come between them. 597 4762
2 43613 The Avengers 220000000 144 2012-04-25 1519557910 The Avengers 7.4 11776 When an unexpected enemy emerges and threatens... Some assembly required. 24428 4769
3 43625 Jurassic World 150000000 418 2015-06-09 1513528810 Jurassic World 6.5 8662 Twenty-two years after the events of Jurassic ... The park is open. 135397 4783
4 43641 Furious 7 190000000 102 2015-04-01 1506249360 Furious 7 7.3 4176 Deckard Shaw seeks revenge against Dominic Tor... Vengeance Hits Home 168259 4794
5 43604 Avengers: Age of Ultron 280000000 134 2015-04-22 1405403694 Avengers: Age of Ultron 7.3 6767 When Tony Stark tries to jumpstart a dormant p... A New Age Has Come. 99861 4769
6 43721 Frozen 150000000 165 2013-11-27 1274219009 Frozen 7.3 5295 Young princess Anna of Arendelle dreams about ... Only the act of true love will thaw a frozen h... 109445 4844
7 43628 Iron Man 3 200000000 77 2013-04-18 1215439994 Iron Man 3 6.8 8806 When Tony Stark's world is torn apart by a for... Unleash the power behind the armor. 68721 4784
8 44143 Minions 74000000 875 2015-06-17 1156730962 Minions 6.4 4571 Minions Stuart, Kevin and Bob are recruited by... Before Gru, they had a history of bad bosses 211672 5045
9 43623 Captain America: Civil War 250000000 198 2016-04-27 1153304495 Captain America: Civil War 7.1 7241 Following the events of Age of Ultron, the col... Divided We Fall 271110 4781

Voting Analysis

  1. Find the most popular movies with highest vote_average
In [ ]:
cur.execute(("SELECT * FROM movies ORDER BY vote_average DESC LIMIT 10"))
most_pop = cur.fetchall() 
most_popular_movie = pd.DataFrame(most_pop,columns =['id', 'original_title', 'budget', 'popularity', 'release_date',
       'revenue', 'title', 'vote_average', 'vote_count', 'overview', 'tagline',
       'uid', 'director_id'])
most_popular_movie
Out[ ]:
id original_title budget popularity release_date revenue title vote_average vote_count overview tagline uid director_id
0 47116 Stiff Upper Lips 0 0 1998-06-12 0 Stiff Upper Lips 10.0 1 Stiff Upper Lips is a broad parody of British ... None 89861 6332
1 47642 Dancer, Texas Pop. 81 0 0 1998-05-01 565592 Dancer, Texas Pop. 81 10.0 1 Four guys, best friends, have grown up togethe... in the middle of nowhere they had everything 78373 5463
2 47589 Sardaarji 0 0 2015-06-26 0 Sardaarji 9.5 2 A ghost hunter uses bottles to capture trouble... None 346081 6588
3 45983 One Man's Hero 0 0 1999-08-02 0 One Man's Hero 9.3 2 One Man's Hero tells the little-known story of... One man's hero is another man's traitor. 69848 5770
4 45478 The Shawshank Redemption 25000000 136 1994-09-23 28341469 The Shawshank Redemption 8.5 8205 Framed in the 1940s for the double murder of h... Fear can hold you prisoner. Hope can set you f... 278 5051
5 46567 There Goes My Baby 10500000 0 1994-09-02 123509 There Goes My Baby 8.5 2 A group of high school seniors meets in the su... None 88641 6051
6 46393 The Prisoner of Zenda 0 4 1937-09-03 0 The Prisoner of Zenda 8.4 11 An Englishman on a Ruritarian holiday must imp... The most thrilling swordfight ever filmed... 43867 5972
7 46934 The Godfather 6000000 143 1972-03-14 245066411 The Godfather 8.4 5893 Spanning the years 1945 to 1955, a chronicle o... An offer you can't refuse. 238 5179
8 44259 Fight Club 63000000 146 1999-10-15 100853753 Fight Club 8.3 9413 A ticking-time-bomb insomniac and a slippery s... Mischief. Mayhem. Soap. 550 4829
9 45415 Schindler's List 22000000 104 1993-11-29 321365567 Schindler's List 8.3 4329 The true story of how businessman Oskar Schind... Whoever saves one life, saves the world entire. 424 4799

Director Analysis

  1. Name all the directors with the number of movies and revenue where Revenue should be taken into account for doing the analysis. The director who has the highest revenue should comes at the top and so on and so forth.
In [ ]:
cur.execute("SELECT name,COUNT(*) AS 'Total Movies',SUM(revenue) AS 'Total Revenue' FROM  directors JOIN movies WHERE directors.id==movies.director_id GROUP BY director_id ORDER BY SUM(revenue) DESC")
director_revenue=cur.fetchall()
director_most_revenue=pd.DataFrame(director_revenue,columns=['Director_Name','Total Movies','Total Revenue'])
director_most_revenue.head(10)
Out[ ]:
Director_Name Total Movies Total Revenue
0 Steven Spielberg 27 9147393164
1 Peter Jackson 9 6498642820
2 James Cameron 7 5883569439
3 Michael Bay 12 5832524638
4 Christopher Nolan 8 4227483234
5 Chris Columbus 11 3725631503
6 Robert Zemeckis 13 3590622002
7 George Lucas 5 3339113893
8 Tim Burton 14 3337418241
9 Ridley Scott 16 3189557997
  1. Name all the directors with the number of movies and revenue where number of movies should be taken into account for doing the analysis. The director who has the highest number of movies should comes at the top and so on and so forth.
In [ ]:
cur.execute("SELECT name, COUNT(title), SUM(revenue) FROM directors JOIN movies ON movies.director_id = directors.id GROUP by director_id ORDER BY  COUNT(title) DESC LIMIT 10")
director_movies = cur.fetchall()
director_most_movies = pd.DataFrame(director_movies,columns=['name','no_of_title','revenue'])
director_most_movies
Out[ ]:
name no_of_title revenue
0 Steven Spielberg 27 9147393164
1 Woody Allen 21 669101038
2 Martin Scorsese 20 1956635998
3 Clint Eastwood 20 2512058888
4 Ridley Scott 16 3189557997
5 Robert Rodriguez 16 1109899581
6 Spike Lee 16 340618771
7 Steven Soderbergh 15 2114864443
8 Renny Harlin 15 1031500590
9 Tim Burton 14 3337418241
  1. Give the Title of the movie, realease_date, budget, revenue, popularity and vote_average made by Steven Spielberg
In [ ]:
cur.execute("SELECT title, release_date,budget,revenue,popularity,vote_average FROM directors JOIN movies ON directors.id==movies.director_id WHERE directors.name=='Steven Spielberg'")
movies_list=cur.fetchall()
movies_list_Steven_Spielberg=pd.DataFrame(movies_list,columns=['Movie_Name','Release_Date','Total Budget','Total_Revenue','Popularity','Vote_Average'])
movies_list_Steven_Spielberg
Out[ ]:
Movie_Name Release_Date Total Budget Total_Revenue Popularity Vote_Average
0 Indiana Jones and the Kingdom of the Crystal S... 2008-05-21 185000000 786636033 75 5.7
1 The BFG 2016-06-01 140000000 183345589 44 6.0
2 War of the Worlds 2005-06-28 132000000 591739379 48 6.2
3 The Adventures of Tintin 2011-10-25 130000000 371940071 89 6.7
4 Minority Report 2002-06-20 102000000 358372926 65 7.1
5 A.I. Artificial Intelligence 2001-06-29 100000000 235926552 34 6.8
6 The Lost World: Jurassic Park 1997-05-23 73000000 229074524 2 6.2
7 The Terminal 2004-06-17 60000000 219417255 57 7.0
8 Munich 2005-12-22 70000000 130358911 29 6.9
9 Hook 1991-12-11 70000000 300854823 33 6.6
10 War Horse 2011-12-25 66000000 177584879 29 7.0
11 Saving Private Ryan 1998-07-24 70000000 481840909 76 7.9
12 Lincoln 2012-11-09 65000000 275293450 36 6.7
13 Jurassic Park 1993-06-11 63000000 920100000 40 7.6
14 Catch Me If You Can 2002-12-25 52000000 352114312 73 7.7
15 Indiana Jones and the Last Crusade 1989-05-24 48000000 474171806 80 7.6
16 Bridge of Spies 2015-10-15 40000000 165478348 48 7.2
17 Amistad 1997-12-03 36000000 74000000 3 6.8
18 1941 1979-12-13 35000000 31755742 10 5.6
19 Indiana Jones and the Temple of Doom 1984-05-23 28000000 333000000 66 7.1
20 Schindler's List 1993-11-29 22000000 321365567 104 8.3
21 Raiders of the Lost Ark 1981-06-12 18000000 389925971 68 7.7
22 Close Encounters of the Third Kind 1977-11-16 20000000 303788635 52 7.2
23 The Color Purple 1985-12-18 15000000 146292009 17 7.7
24 Jaws 1975-06-18 7000000 470654000 50 7.5
25 E.T. the Extra-Terrestrial 1982-04-03 10500000 792910554 56 7.3
26 Twilight Zone: The Movie 1983-06-24 10000000 29450919 12 6.2